{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "raw_docs = [\"Here are some very simple basic sentences.\",\n",
    "\"They won't be very interesting, I'm afraid.\",\n",
    "\"The point of these examples is to _learn how basic text cleaning works_ on *very simple* data.\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences', '.'], ['They', 'wo', \"n't\", 'be', 'very', 'interesting', ',', 'I', \"'m\", 'afraid', '.'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', '_learn', 'how', 'basic', 'text', 'cleaning', 'works_', 'on', '*very', 'simple*', 'data', '.']]\n"
     ]
    }
   ],
   "source": [
    "# Tokenizing text into bags of words\n",
    "from nltk.tokenize import word_tokenize\n",
    "tokenized_docs = [word_tokenize(doc) for doc in raw_docs]\n",
    "print(tokenized_docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences'], ['They', 'wo', 'nt', 'be', 'very', 'interesting', 'I', 'm', 'afraid'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', 'learn', 'how', 'basic', 'text', 'cleaning', 'works', 'on', 'very', 'simple', 'data']]\n"
     ]
    }
   ],
   "source": [
    "# Removing punctuation\n",
    "import re\n",
    "import string\n",
    "regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html\n",
    "\n",
    "tokenized_docs_no_punctuation = []\n",
    "\n",
    "for review in tokenized_docs:\n",
    "    new_review = []\n",
    "    for token in review:\n",
    "        new_token = regex.sub(u'', token)\n",
    "        if not new_token == u'':\n",
    "            new_review.append(new_token)\n",
    "    \n",
    "    tokenized_docs_no_punctuation.append(new_review)\n",
    "    \n",
    "print(tokenized_docs_no_punctuation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['Here', 'simple', 'basic', 'sentences'], ['They', 'wo', 'nt', 'interesting', 'I', 'afraid'], ['The', 'point', 'examples', 'learn', 'basic', 'text', 'cleaning', 'works', 'simple', 'data']]\n"
     ]
    }
   ],
   "source": [
    "# Cleaning text of stopwords\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "tokenized_docs_no_stopwords = []\n",
    "\n",
    "for doc in tokenized_docs_no_punctuation:\n",
    "    new_term_vector = []\n",
    "    for word in doc:\n",
    "        if not word in stopwords.words('english'):\n",
    "            new_term_vector.append(word)\n",
    "    \n",
    "    tokenized_docs_no_stopwords.append(new_term_vector)\n",
    "\n",
    "print(tokenized_docs_no_stopwords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['here', 'simpl', 'basic', 'sentenc'], ['they', 'wo', 'nt', 'interest', 'I', 'afraid'], ['the', 'point', 'exampl', 'learn', 'basic', 'text', 'clean', 'work', 'simpl', 'data']]\n"
     ]
    }
   ],
   "source": [
    "# Stemming and Lemmatizing\n",
    "from nltk.stem.porter import PorterStemmer\n",
    "from nltk.stem.snowball import SnowballStemmer\n",
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "\n",
    "porter = PorterStemmer()\n",
    "snowball = SnowballStemmer('english')\n",
    "wordnet = WordNetLemmatizer()\n",
    "\n",
    "preprocessed_docs = []\n",
    "\n",
    "for doc in tokenized_docs_no_stopwords:\n",
    "    final_doc = []\n",
    "    for word in doc:\n",
    "        final_doc.append(porter.stem(word))\n",
    "        #final_doc.append(snowball.stem(word))\n",
    "        #final_doc.append(wordnet.lemmatize(word))\n",
    "    \n",
    "    preprocessed_docs.append(final_doc)\n",
    "\n",
    "print(preprocessed_docs)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
